**Jobs:** 1

## Input files

- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter0.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter1.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter2.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter3.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter4.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter5.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter6.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter7.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter8.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter9.log
- /2/scratch/keaton/plague-phylogeography/workflow/logs/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter10.log

## Output files

- /2/scratch/keaton/plague-phylogeography/results/snippy_multi/{reads_origin}/missing_data_{locus_name}.snps.html
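The wildcard paths above are filled in by the Snakemake rule that runs this script. A minimal sketch of what that rule could look like: the rule name, script path, and relative directory roots are assumptions, while the `filter_snp_log` input name, the `plot` output name, and the path patterns come from this report, and the `range(11)` threshold series is inferred from the filter0 through filter10 log list:

```python
rule plot_missing_data:  # hypothetical rule name
    input:
        filter_snp_log=expand(
            "workflow/logs/snippy_multi/{{reads_origin}}/"
            "snippy-core_{{locus_name}}.snps.filter{threshold}.log",
            threshold=range(11),  # filter0 ... filter10, per the input list above
        ),
    output:
        plot="results/snippy_multi/{reads_origin}/missing_data_{locus_name}.snps.html",
    script:
        "scripts/plot_missing_data.py"  # hypothetical script location
```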
## Code

```python
import os

import pandas as pd
import plotly.graph_objects as go
```
```python
# Identify the reads_origin from the filter snp log paths
reads_origin_list = ["assembly", "sra", "local", "all"]
for origin in reads_origin_list:
    if origin in snakemake.input.filter_snp_log[0]:
        reads_origin = origin
        break

# Identify the plot thresholds from the filter snp log paths
plot_thresholds = []
for filterlog in snakemake.input.filter_snp_log:
    threshold = [item.replace("filter", "") for item in filterlog.split(".") if "filter" in item][0]
    plot_thresholds.append(threshold)
print(plot_thresholds)

# Identify the locus from the last filter snp log
# (filterlog still holds the final value from the loop above)
target_locus = [item.replace("snippy-core_", "").split(".")[0] for item in filterlog.split("/") if "snippy-core" in item][0]
print("Target Locus: ", target_locus)

# Directories to search
project_dir = os.getcwd()
output_dir = os.path.join(project_dir, "results", "snippy_multi", reads_origin)
logs_dir = os.path.join(project_dir, "workflow", "logs", "snippy_multi", reads_origin)

# Initialize the data dict
data = {
    "missing_data": [],
    "singleton_variants": [],
    "parsimony_variants": [],
    "filtered_variants": [],
}
num_samples = 0
num_sites = 0
```
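To make the threshold parsing concrete, this is how the list comprehension above behaves on a single log path (the path is a hypothetical example matching the input pattern, with `all` as the reads origin and `chromosome` as the locus):

```python
# Hypothetical log path matching the input pattern
path = "workflow/logs/snippy_multi/all/snippy-core_chromosome.snps.filter5.log"
# Split on "." and keep the component containing "filter", minus that prefix
threshold = [item.replace("filter", "") for item in path.split(".") if "filter" in item][0]
print(threshold)  # -> "5" (kept as a string; it later doubles as an axis tick value)
```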
## Parse Data - Snakemake
```python
for threshold in plot_thresholds:
    # Search for all variant counts in the filter_snp logs
    filterlog_file = "snippy-core_{}.snps.filter{}.log".format(target_locus, threshold)
    num_samples_term = "Number of samples: "
    num_sites_term = "Alignment length: "
    singleton_variants_term = "Total singleton sites: "
    parsimony_variants_term = "Parsimony informative sites: "
    filtered_variants_term = "Sites passing missing data filter: "
    singleton_variants = 0
    parsimony_variants = 0
    filtered_variants = 0
    for file in snakemake.input.filter_snp_log:
        if filterlog_file in file:
            data["missing_data"].append(threshold)
            with open(file, "r") as logfile:
                for line in logfile:
                    # Get the number of samples
                    if num_samples_term in line:
                        num_samples = line.split(num_samples_term)[1].strip()
                    # Get the count of all sites
                    if num_sites_term in line:
                        num_sites = line.split(num_sites_term)[1].strip()
                    # Get the singleton count
                    if singleton_variants_term in line:
                        singleton_variants = int(line.split(singleton_variants_term)[1].split()[0])
                    # Get the parsimony-informative count
                    if parsimony_variants_term in line:
                        parsimony_variants = int(line.split(parsimony_variants_term)[1].split()[0])
                    # Get the filtered count
                    if filtered_variants_term in line:
                        filtered_variants = int(line.split(filtered_variants_term)[1].strip())
            # Add the parsed data to the dictionary
            data["singleton_variants"].append(singleton_variants)
            data["parsimony_variants"].append(parsimony_variants)
            data["filtered_variants"].append(filtered_variants)

print(data)
print("num_samples: ", num_samples)
print("num_sites: ", num_sites)
```
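### Initialize Figure

The three traces below are added to a plotly figure object, so one must exist before this point; a minimal initialization, assuming the script starts from an empty figure:

```python
# Create the empty figure that the traces below are added to
fig = go.Figure()
```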
### Add Data - Singleton Variants

```python
fig.add_trace(
    go.Scatter(
        x=data["missing_data"],
        y=data["singleton_variants"],
        mode="lines",
        name="Singleton Variants",
        line=dict(width=5),
    )
)
```
### Add Data - Parsimony Informative Variants

```python
fig.add_trace(
    go.Scatter(
        x=data["missing_data"],
        y=data["parsimony_variants"],
        mode="lines",
        name="Parsimony Informative Variants",
        line=dict(width=5),
    )
)
```
### Add Data - Filtered Variants
```python
fig.add_trace(
    go.Scatter(
        x=data["missing_data"],
        y=data["filtered_variants"],
        mode="lines+markers",
        name="Filtered Variants",
        marker=dict(
            size=20,
            line=dict(
                color="DarkSlateGrey",
                width=2,
            ),
        ),
        line=dict(width=5),
    )
)
```
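Note the design choice here: unlike the two diagnostic traces, the filtered-variant trace uses `lines+markers` with large, dark-outlined markers, so the number of sites actually retained at each missing-data threshold stands out as the focal point of the plot.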
```python
fig.update_layout(
    template="simple_white",
    width=1080,
    height=720,
    title=(
        "<b>Variants Across Missing Data Site Thresholds <br> (Samples: "
        + str(num_samples)
        + ", Sites: "
        + str(num_sites)
        + ") </b>"
    ),
    title_x=0.5,
    xaxis=dict(
        title="Missing Data Threshold Per Site (%)",
        tickvals=plot_thresholds,
    ),
    yaxis_title="Number of Variant Sites",
)
```
```python
# Write the interactive plot to the path declared in the rule's output
# output_plot = os.path.join(output_dir, "missing_data.html")
output_plot = snakemake.output.plot
fig.write_html(output_plot)
```
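Note that the `snakemake` object used throughout (with its `input.filter_snp_log` and `output.plot` attributes) is injected by Snakemake when the file runs via a rule's `script:` directive, as sketched under the file lists above; executed standalone, the script would raise a `NameError`.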